Document last updated 2021-04-21 16:15:30 by Benjamin Meyer (ben@kenaiwatershed.org)
This draft document contains a data query and summary performed to generate average hardness values for select sites included in the Kenai River Baseline Water Quality Monitoring Program (KRBWQM).
Notes:
A table summarising average hardness values for select KRBWQM sites was provided to B Meyer in Nov. 2020 as part of an initial draft report on Copper and Zinc metals concentrations in the Kenai River. Summary methods and data origin were undocumented.
To verify the origin of this data, the analysis here instead uses data directly from the EPA repository at waterqualitydata.us where available (2000 - 2013), as well as from results found compiled on the Kenai Watershed Forum server for results 2014-2019.
KRBWQM data in the EPA repository 2000 - 2013 is sourced from the following query: https://www.waterqualitydata.us/portal/#countrycode=US&statecode=US%3A02&countycode=US%3A02%3A122&sampleMedia=water&sampleMedia=Water&characteristicType=Inorganics%2C%20Major%2C%20Metals&characteristicType=Inorganics%2C%20Minor%2C%20Metals&mimeType=csv&dataProfile=narrowResult
KRBWQM data found on the Kenai Watershed Forum server is found in the “data” project of this project repository at “data/Compiled_KRBWQM_data_2014_2019”
Note: as soon as feasible, all water quality data 2014 - present will also be submitted to the EPA repository.
Read in Calcium and Magnesium data from EPA repository
# Read in metals data 2000 - 2013.
# Data sourced from the following query at the EPA water quality repository:
# https://www.waterqualitydata.us/portal/#countrycode=US&statecode=US%3A02&countycode=US%3A02%3A122&sampleMedia=water&sampleMedia=Water&characteristicType=Inorganics%2C%20Major%2C%20Metals&characteristicType=Inorganics%2C%20Minor%2C%20Metals&mimeType=csv&dataProfile=narrowResult
epa_dat <- read.csv("data/waterqualitydata_epa_repo_all_metals.csv")

# Columns to retain from the raw WQX "narrow result" download.
# (The original as.character() wrapper was redundant: c() of string
# literals already returns a character vector.)
vars <- c(
  "OrganizationFormalName",
  "ActivityIdentifier",
  "ActivityStartDate",
  "ActivityStartTime.Time",
  "MonitoringLocationIdentifier",
  "ResultIdentifier",
  "ResultDetectionConditionText",
  "CharacteristicName",
  "ResultSampleFractionText",
  "ResultMeasureValue",
  "ResultMeasure.MeasureUnitCode",
  "MeasureQualifierCode",
  "ResultStatusIdentifier",
  "ResultValueTypeName")

# Short working names, in the same order as `vars`
new_vars <- c("agency",
              "activity_id",
              "date",
              "time",
              "location_id",
              "result_id",
              "detection_qualifier_text",
              "parameter",
              "substance_condition",
              "val",
              "unit",
              "detection_qualifier_code",
              "result_status",
              "result_type")

# Retain the selected columns only
epa_dat <- epa_dat %>%
  select(all_of(vars))

# Rename to the short working names
colnames(epa_dat) <- new_vars

# Retain Ca and Mg results only (the inputs to the hardness calculation)
epa_dat <- epa_dat %>%
  filter(parameter %in% c("Calcium", "Magnesium"))
Read in site data for Kenai River region
# We want Ca and Mg data for KRBWQM sites only — examine the sites.
# Available sites in the region were queried at the EPA repository:
# https://www.waterqualitydata.us/portal/#bBox=-151.413081%2C60.292340%2C-149.215768%2C60.715224&mimeType=csv
# (Used bounding box)
epa_krbwqm_sites <- read.csv("data/waterqualitydata_epa_repo_sites.csv")

# Raw WQX site-metadata columns worth keeping
site_vars <- c("OrganizationFormalName",
               "MonitoringLocationIdentifier",
               "MonitoringLocationName",
               "MonitoringLocationTypeName",
               "MonitoringLocationDescriptionText",
               "HUCEightDigitCode",
               "DrainageAreaMeasure.MeasureValue",
               "DrainageAreaMeasure.MeasureUnitCode",
               "LatitudeMeasure",
               "LongitudeMeasure",
               "HorizontalCollectionMethodName",
               "HorizontalCoordinateReferenceSystemDatumName",
               "VerticalMeasure.MeasureValue",
               "VerticalMeasure.MeasureUnitCode")

# Short working names, one per entry of `site_vars`
site_new_vars <- c("agency",
                   "location_id",
                   "location_name",
                   "location_type",
                   "location_description",
                   "huc",
                   "drainage_area",
                   "drainage_area_unit",
                   "lat",
                   "long",
                   "horizontal_collection_method",
                   "horizontal_coords_system",
                   "elevation",
                   "elevation_unit")

# Select and rename in one step: a named character vector inside all_of()
# keeps the listed columns and renames them (new = old).
epa_krbwqm_sites <- epa_krbwqm_sites %>%
  select(all_of(setNames(site_vars, site_new_vars)))
Join sites with Ca/Mg data to site wqx data
epa_dat <- left_join(epa_dat,epa_krbwqm_sites, by = c("location_id","agency"))
What are all the different agencies that have collected Ca/Mg data in the Kenai Peninsula Borough?
# List every organization with Ca/Mg results in the query area
unique(epa_dat$agency)
## [1] "USGS Alaska Water Science Center"
## [2] "National Park Service Water Resources Division"
## [3] "Kenai Watershed Forum(Volunteer)*"
## [4] "EPA Region 10 Superfund Historical Data"
## [5] "Seldovia Village Tribe"
Retain only Kenai Watershed Forum’s stream/river/lake data
# Keep only Kenai Watershed Forum results (agency string exactly as stored in WQX)
epa_dat <- filter(epa_dat, agency == "Kenai Watershed Forum(Volunteer)*")
Modify column classes
# Modify column classes: parse date and time strings into Date / hms.
# NOTE(review): mdy() assumes month/day/year strings in the raw csv; the
# WQX export typically uses yyyy-mm-dd — confirm parsing yields no NAs.
# NOTE(review): transform() returns a plain data.frame; downstream code
# here appears to rely only on data.frame behavior, but verify.
epa_dat <- epa_dat %>%
  transform(date = mdy(date),
            time = hms::as_hms(time))
Let’s plot our data on a leaflet map to assess location data and assess if further site name QA/QC is needed
Create leaflet map:
# Interactive map of all retained sites; marker popups show the site name
leaflet(epa_dat) %>%
  addTiles() %>%
  addMarkers(lng = ~long,
             lat = ~lat,
             popup = ~location_description)
#leaflet() %>%
# addTiles() %>% # Add default OpenStreetMap map tiles
#fitBounds(-150, 60.04,-149.0, 60.02) %>%
#setView(-150.210169, 60.487694, zoom = 8) %>%
# addMarkers(lng = all.dat$Longitude, lat = all.dat$Latitude,
# popup = paste("SiteID = ", all.dat$SiteID, "<br>",
# "Data Source = ", all.dat$SourceName, "<br>",
# "Start Year = ", all.dat$startYear, "<br>",
# "End Year = ", all.dat$endYear, "<br>",
# "Total Years of Data = ", all.dat$totYears, "<br>"))
What are the site names remaining in the EPA data set?
# List the distinct site names remaining in the EPA-sourced data
unique(epa_dat$location_description)
## [1] "No_Name_Creek" "Kenai_Lake_Bridge"
## [3] "Poachers_Cove" "Russian_River"
## [5] "Pillars" "Soldotna_Creek"
## [7] "Moose_River" "Jims_Landing"
## [9] "Kenai_River_near_Beaver_creek" "City_of_Kenai_Docks"
## [11] "Killey_River" "Morgans_Landing"
## [13] "Skilak_Lake_Outlet" "Upstream_Dow_Island"
## [15] "Swiftwater_Park" "Soldotna_Bridge"
## [17] "Bings_Landing" "Slikok_Creek"
## [19] "Funny_River" "Cunningham_Park"
## [21] "Beaver_Creek" "Juneau_Creek"
Looks good!
Read in data from Kenai Watershed Forum server
# Read in metals data 2014 - 2019 from the compiled file found on the
# Kenai Watershed Forum server.
kwf_dat <- read_excel("data/Compiled_KRBWQM_data_2014_2019.xlsx", sheet = "Master")

# Reshape to match the structure of the data imported from the EPA
# repository: keep Ca/Mg only, drop columns with no EPA equivalent,
# rename the rest, and tag the agency.
kwf_dat <- kwf_dat %>%
  filter(Parameter %in% c("Calcium","Magnesium")) %>%
  select(-Year,-Season,-ChannelType,-Lab,-TestType) %>%
  rename(date = Date,
         location_description = Site,
         parameter = Parameter,
         val = Result,
         unit = Units,
         detection_qualifier_code = Code,
         duplicate = Duplicate) %>%
  mutate(agency = "Kenai Watershed Forum(Volunteer)*")

# Address non-detects ("ND") in the val column: flag them in a text
# column, then blank them out so val can be stored as numeric.
kwf_dat <- kwf_dat %>%
  # Guard with !is.na(): the bare `val == "ND"` comparison propagated NA
  # into the flag column for rows where val was already missing.
  mutate(detection_qualifier_text =
           ifelse(!is.na(val) & val == "ND", "Not Detected", "")) %>%
  mutate(val = na_if(val, "ND")) %>%
  # mutate() rather than transform(): transform() silently converts the
  # tibble to a plain data.frame mid-pipeline.
  mutate(date = as.Date(date),
         val = as.double(val))
What are the site names in our KWF data set?
# List the distinct site names in the KWF-server data (river-mile prefixed,
# a different convention than the EPA names above)
unique(kwf_dat$location_description)
## [1] "RM 19 - Slikok Creek" "RM 21 - Soldotna Bridge"
## [3] "RM 22 - Soldotna Creek" "RM 23 - Swiftwater Park"
## [5] "RM 6.5 - Cunningham Park" "RM 10 - Beaver Creek"
## [7] "RM 10.1 - Kenai River" "RM 12.5 - Pillars"
## [9] "RM 18 - Poachers Cove" "RM 70 - Jims Landing"
## [11] "RM 74 - Russian River" "RM 82 - Kenai Lake Bridge"
## [13] "RM 79.5 - Juneau Creek" "RM 30 - Funny River"
## [15] "RM 31 - Morgans Landing" "RM 36 - Moose River"
## [17] "RM 40 - Bing's Landing" "RM 43 - US Dow Island"
## [19] "RM 44 - Mouth of Killey River" "RM 50 - Skilak Lake Outflow"
## [21] "RM 0 - No Name Creek" "RM 1.5 - Kenai Docks"
Issue: Site names in the EPA data set are formatted differently than those in the KWF data set. Solution: use a matching table and left_join to resolve the differences.
Perform data munging to generate matching table and resolve site name differences
# Export the two sets of distinct site names so they can be matched by hand.
# (row.names spelled out as FALSE: T and F are reassignable and unsafe.)
write.csv(unique(epa_dat$location_description),
          "output/site_names_table/epa_site_names.csv", row.names = FALSE)
write.csv(unique(kwf_dat$location_description),
          "output/site_names_table/kwf_site_names.csv", row.names = FALSE)

# The two csv files above were used to manually match site names in a new
# file, "final_site_names.csv". Read it back and key it on the KWF names.
site_match_table <- read.csv("output/site_names_table/final_site_names.csv") %>%
  rename(location_description = kwf_site_names)

# Replace the KWF location_description column with the EPA naming convention
kwf_dat <- left_join(kwf_dat, site_match_table, by = "location_description") %>%
  select(-location_description) %>%
  rename(location_description = epa_site_names)
# Join, arrange, and fill in:
# stack EPA (2000-2013) and KWF (2014-2019) rows into one table.
# NOTE(review): fill() propagates river_mile upward within the arranged
# table, but river_mile is dropped by the select() on the next line and
# re-attached by the second join below — the fill() result appears to be
# discarded. Confirm whether the fill step is still needed.
dat <- bind_rows(epa_dat,kwf_dat) %>%
  arrange(location_description) %>%
  fill(river_mile, .direction = "up") %>%
  left_join(site_match_table) %>%
  select(-epa_site_names,-waterbody,-river_mile)

# Re-key the match table on the EPA-style names so it can be joined to
# `dat` by location_description.
site_match_table <- site_match_table %>%
  select(-location_description) %>%
  rename("location_description" = "epa_site_names")

# Attach waterbody type and river mile to all sites not already designated
dat <- left_join(dat,site_match_table)
Are parameter units consistent?
# Inspect the distinct unit strings before harmonizing them
unique(dat$unit)
## [1] "mg/l" "mg/L" "ug/L" "ug/l"
Resolve unit magnitude and abbreviation convention inconsistencies. Make all units mg/L
# Harmonize units: standardize the two lower-case spellings first, then
# convert every ug/L value to mg/L so all results share one unit.
dat <- dat %>%
  mutate(unit = case_when(unit == "mg/l" ~ "mg/L",
                          unit == "ug/l" ~ "ug/L",
                          TRUE ~ unit)) %>%
  # val is rescaled while unit still reads "ug/L"; the unit label is
  # rewritten afterwards within the same mutate (sequential evaluation)
  mutate(val = if_else(unit == "ug/L", val / 1000, val),
         unit = if_else(unit == "ug/L", "mg/L", unit))
What is the range of our concentration values?
# Range check on concentration values (mg/L after unit harmonization);
# the maximum of 582 looks anomalous — investigated below.
summary(dat$val)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.17 1.46 7.56 11.59 11.26 582.00 6
Do we have a consistent range of concentration values? Let's see if we can identify any anomalies.
# Ca/Mg concentrations over time, one panel per site (shared y-axis)
ggplot(dat, aes(x = date, y = val, color = parameter)) +
  geom_point() +
  facet_wrap(~ location_description)

# Interactive view of the one visually anomalous site
p <- dat %>%
  filter(location_description == "City_of_Kenai_Docks") %>%
  ggplot(aes(x = date, y = val, color = parameter)) +
  geom_point() +
  labs(title = "City of Kenai Docks")
ggplotly(p)
It appears that Ca/Mg data from the City of Kenai Docks site is potentially suspect, or some different hydrological process is occurring there. Reasons unknown. Marine influence likely?
# Sites included in Table 6 of the draft DEC report
table6_sites <- c("No_Name_Creek", "Beaver_Creek", "Slikok_Creek",
                  "Soldotna_Creek", "Skilak_Lake_Outlet", "Jims_Landing")

# Subset the combined data to just those sites
dat_tbl6 <- filter(dat, location_description %in% table6_sites)
Let’s examine the range of our data again, just within the sites described in Table 6 of the draft DEC report:
# Same time-series scatterplot, restricted to the Table 6 sites
ggplot(dat_tbl6, aes(x = date, y = val, color = parameter)) +
  geom_point() +
  facet_wrap(~ location_description)
Let’s visualize our data in a different way to see if any other anomalies exist.
Plot all sites with variable y-axes:
# All sites again, now with free y-axes so low-concentration sites
# remain readable next to the anomalous ones
ggplot(dat, aes(x = date, y = val, color = parameter)) +
  geom_point() +
  facet_wrap(~ location_description, scales = "free_y")
Notes:
It is beyond the scope of this current draft DEC report to diagnose (or potentially correct) these data if anomalies are genuine. Such work will be conducted at a later date in 2021 pending funding from the Bureau of Restoration grant proposal or other funded projects.
For purposes of generating the table of long-term average Ca/Mg at a limited set of sites, we do not need the City of Kenai Docks data anyway.
Examine two sites in greater detail:
Lower No Name Creek
Kenai City Docks
# Two sites to inspect in greater detail
sites <- c("No_Name_Creek", "City_of_Kenai_Docks")

# TODO: restrict to < 2018 only?
# Plot raw Ca and Mg values for the two sites
dat %>%
  filter(location_description %in% sites) %>%
  ggplot(aes(x = date, y = val, color = parameter)) +
  geom_point() +
  facet_wrap(~ location_description, scales = "free_y") +
  labs(title = "Ca and Mg Values 2014 - 2018")
# Compute CCC (criterion continuous concentration) values for Cu and Zn
# using all available data for the two sites of interest.
z <- dat %>%
  filter(location_description %in% sites,
         duplicate != "Y",          # exclude field duplicates
         date < "2019-01-01") %>%
  select(date, location_description, parameter, val, unit) %>%
  pivot_wider(names_from = parameter, values_from = val) %>%
  # Hardness (mg/L as CaCO3) = 2.497*[Ca] + 4.119*[Mg], Standard Methods
  # (20th ed.). The coefficient previously read 2.487 here, inconsistent
  # with the 2.497 used in the hardness summary later in this document.
  mutate(hardness = 2.497*Calcium + 4.119*Magnesium,
         # hardness-dependent freshwater chronic criteria equations
         Cu_CCC = exp(0.8545*log(hardness) - 1.702) * 0.96,
         Zn_CCC = exp(0.8473*log(hardness) - 0.884) * 0.986) %>%
  select(-Calcium, -Magnesium, -hardness) %>%
  pivot_longer(cols = c("Cu_CCC", "Zn_CCC"), values_to = "CCC")
# plot CCC values for two sites
# Time series of the CCC values for the two sites
ggplot(z, aes(x = date, y = CCC, color = name)) +
  geom_point() +
  facet_wrap(~ location_description, scales = "free_y") +
  labs(title = "CCC Values for Cu and Zn, 2014 - 2018")
# Summary table: mean / min / max CCC and the date range of the underlying
# samples, by site and metal (`name` is "Cu_CCC" or "Zn_CCC").
z1 <- z %>%
  group_by(location_description,name) %>%
  summarise(mean_CCC = mean(CCC),
            min_CCC = min(CCC),
            max_CCC = max(CCC),
            min_date = min(date),
            max_date = max(date))

# Render as an interactive table with the CCC columns rounded to 3 digits
z1 %>%
  datatable() %>%
  formatRound(columns=c('mean_CCC',
                        'min_CCC',
                        'max_CCC'), digits=3)
Using the data discussed above, we will generate a summary of hardness values for all sites.
Assign sample events to either “Spring” or “Summer”
What unique months are included in the data set?
# assign sample events to either "Spring" (month = 4 or 5) or "Summer" (month = 7)
# First extract the numeric month from each sample date.
dat <- dat %>%
  mutate(month = month(date))
# Outer parentheses force the assigned value to print, confirming that
# only months 4, 5, and 7 occur in the data set.
(z <- unique(dat$month))
## [1] 4 7 5
# assign seasons
# Assign seasons: months 4-5 = spring, otherwise (month 7 in this data
# set) = summer. Months are compared as numbers; the original compared
# the numeric month column against character strings c("4","5"), which
# only worked through implicit coercion.
dat <- dat %>%
  mutate(season = ifelse(month %in% c(4, 5), "spring", "summer"))
Modify site names to conform with data structure in DEC report (e.g. “Lower” and “Upper” creek sites).
All sites from data that was housed on the KWF server are from the “lower” sections. Assign site names as such.
Prepare to calculate hardness values
Calculate hardness values by site and biannual sample event
# Hardness 2000 - 2018: mean Ca and Mg per site / season / year, then
# hardness from the Standard Methods formula.
hardness <- dat %>%
  mutate(year = year(date)) %>%
  filter(year < 2019) %>%
  filter(!is.na(val)) %>%
  group_by(year,season,river_mile,waterbody,location_description,parameter) %>%
  summarise(mean_val = mean(val),
            n = n()) %>%
  arrange(river_mile) %>%
  # arrange to wider format
  # NOTE(review): `n` is per-parameter; if the Ca and Mg sample counts
  # differ within a group, pivot_wider() keeps them as two rows (one with
  # NA hardness) — confirm the counts always match.
  pivot_wider(names_from = parameter, values_from = mean_val) %>%
  # calculate hardness values
  # formula source: draft DEC report in "Clesceri, L.S., Greenberg, A.E., Eaton, A.D. (Eds.). 1998. Standard Methods for the Examination of Water and Wastewater (20th ed.), Washington D.C. American Public Health Association, American Water Works Association, and Water Environment Federation."
  mutate(hardness = 2.497*Calcium + 4.119*Magnesium) %>%
  select(-Calcium,-Magnesium,-n)
Summarise hardness values
# Tabulate mean hardness ± standard error by season and site
hardness %>%
  ungroup() %>%
  #filter(location_description %in% table6_sites) %>%
  group_by(season, river_mile, waterbody, location_description) %>%
  filter(!is.na(hardness)) %>%
  summarise(n = n(),
            avg = sprintf("%0.2f", mean(hardness)),
            se = sprintf("%0.2f", std.error(hardness))) %>%
  mutate("Hardness (mg/L) (Mean ± Std. Error)" = paste(avg, "±", se)) %>%
  select(-avg, -se)
Plot hardness values
# Reorder the site factor by river mile so plot panels read in
# downstream-to-upstream order.
# Whole-column base assignment on purpose: a grouped mutate() on the
# still-grouped `hardness` tibble would reorder within groups instead.
hardness$location_description <- reorder(hardness$location_description, hardness$river_mile)
# Mainstem sites: hardness boxplots by site, colored by season
# (column renamed so the legend title is capitalized)
(p1 <- hardness %>%
   filter(waterbody == "Mainstem") %>%
   rename("Season" = "season") %>%
   ggplot(aes(x = location_description, y = hardness, color = Season)) +
   geom_boxplot(position = position_dodge(width = 0.7)) +
   geom_jitter(position = position_jitterdodge()) +
   facet_wrap(waterbody ~ ., scales = "free") +
   labs(x = "", y = "Hardness") +
   theme_bw() +
   theme(axis.text.x = element_text(angle = 45, hjust = 1),
         strip.text = element_text(face = "bold")))
# Tributary sites: same boxplot layout as the mainstem panel above
(p2 <- hardness %>%
   filter(waterbody == "Tributary") %>%
   rename("Season" = "season") %>%
   ggplot(aes(x = location_description, y = hardness, color = Season)) +
   geom_boxplot(position = position_dodge(width = 0.7)) +
   geom_jitter(position = position_jitterdodge()) +
   facet_wrap(waterbody ~ ., scales = "free") +
   labs(x = "", y = "Hardness") +
   theme_bw() +
   theme(axis.text.x = element_text(angle = 45, hjust = 1),
         strip.text = element_text(face = "bold")))
# Stack the mainstem and tributary panels, sharing a single legend.
# (common.legend spelled out as TRUE: T is reassignable and unsafe.)
ggarrange(p1, p2, ncol = 1, nrow = 2,
          common.legend = TRUE,
          legend = "bottom") %>%
  annotate_figure(top = text_grob("2000 - 2018 Hardness Values\nKenai River Mainstem and Tributary Sites"))

# Export the most recently displayed figure
ggsave("output/figures/hardness_2000_2018.png", height = 10, width = 6)
Plot 2019 - 2020 asynchronous hardness values in context of long term data
generate hardness values…
address duplicates… assign each event to spring or summer sample event..